library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaflet)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readxl)

# Default figure geometry applied to every knitted chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Minimal ggplot theme with the legend placed underneath the panel.
theme_set(
  theme_minimal() +
    theme(legend.position = "bottom")
)

# Ask ggplot2 to use viridis for continuous colour and fill scales.
options(ggplot2.continuous.colour = "viridis", ggplot2.continuous.fill = "viridis")

# Bind the viridis discrete scales to the default discrete scale names in the
# global environment.
scale_colour_discrete <- scale_color_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

Read in wine data.

# Load the tidied wine reviews. The parser output below shows the first two
# columns arrive unnamed and are filled in as `X1` and `X1_1`.
wine_df <- read_csv("./wine_data/tidy/wine_all.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X1_1 = col_double(),
##   country = col_character(),
##   description = col_character(),
##   designation = col_character(),
##   points = col_double(),
##   price = col_double(),
##   province = col_character(),
##   region_1 = col_character(),
##   taster_name = col_character(),
##   title = col_character(),
##   variety = col_character(),
##   winery = col_character(),
##   year = col_double(),
##   type = col_character()
## )
### remove region 2, taster twitter and missing values in region 1.

#wine_type <- read_csv("./wine_data/winemag-data-130k-v2.csv") %>% 
#            group_by(variety) %>% 
#            count() %>% 
#            arrange(desc(n)) %>% 
#            as.tibble()

Make a plot of distribution of price/rating by type

# Violin plot of price for each wine type, restricted to rows with a known
# type and a price of at most 200 (rows with a missing price fail the
# comparison and are dropped as well).
wine_df %>%
  filter(price <= 200, !is.na(type)) %>%
  ggplot(aes(x = type, y = price, colour = type)) +
  geom_violin()

# Violin plot of rating for each wine type; `points` is renamed to `rating`
# so the axis is labelled accordingly.
wine_df %>%
  drop_na(type) %>%
  rename(rating = points) %>%
  ggplot(aes(x = type, y = rating, colour = type)) +
  geom_violin()

Make plots of mean price and mean rating by country.

# Interactive bar chart of the mean wine price per country, with the bars
# ordered by that mean.
# `y` holds the y-axis settings passed to layout() at the end of the pipeline.
y <- list(
  title = "Mean Price"
)
wine_df %>% 
  # Keep rows that have both a price and a country. Rows without a country
  # previously formed an NA group, which is presumably the single observation
  # plotly reported as ignored.
  filter(!is.na(price),
         !is.na(country)) %>% 
  group_by(country) %>% 
  # `.groups = "drop"` makes the ungrouped result explicit and silences the
  # summarise() grouping message.
  summarise(mean = mean(price), .groups = "drop") %>% 
  mutate(
    # Order the country factor by the unrounded mean so bars are sorted.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    # Hover text; built after rounding so it shows the rounded mean.
    text_label = str_c("Country:", country, "\nmean price:", mean)
  ) %>% 
  plot_ly(
    x = ~country, y = ~mean, color = ~country, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>% 
  layout(yaxis = y)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Interactive bar chart of the mean rating (points) per country, with the bars
# ordered by that mean.
# `y` holds the y-axis settings passed to layout() at the end of the pipeline.
y <- list(
  title = "Mean rating"
)
wine_df %>% 
  # Keep rows that have both a rating and a country. Rows without a country
  # previously formed an NA group, which is presumably the single observation
  # plotly reported as ignored.
  filter(!is.na(points),
         !is.na(country)) %>% 
  group_by(country) %>% 
  # `.groups = "drop"` makes the ungrouped result explicit and silences the
  # summarise() grouping message.
  summarise(mean = mean(points), .groups = "drop") %>% 
  mutate(
    # Order the country factor by the unrounded mean so bars are sorted.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    # Hover text; built after rounding so it shows the rounded mean.
    text_label = str_c("Country:", country, "\nmean rating:", mean)
  ) %>% 
  plot_ly(
    x = ~country, y = ~mean, color = ~country, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>% 
  layout(yaxis = y)

Even though Switzerland has the highest average wine price, its average rating ranks only 11th.

Make plots of ratings/price by year by wine type.

# Line chart of the mean rating per year, one line per wine type.
wine_df %>% 
  # Keep rows with a rating, a type and a year after 1900 (rows with a
  # missing year fail the comparison and are dropped too).
  filter(!is.na(points),
         !is.na(type),
         year > 1900) %>% 
  group_by(year, type) %>% 
  # Without `.groups`, the result stays grouped by year and dplyr prints a
  # regrouping message; drop the grouping explicitly instead.
  summarise(mean = mean(points), .groups = "drop") %>% 
  mutate(mean = round(mean, 2)) %>% 
  ggplot(aes(x = year, y = mean, color = type)) +
  geom_line() +
  labs(y = "Mean rating",
       x = "Year",
       title = "change in mean ratings by wine type and year")

# Point-and-line chart of the mean price per year, one series per wine type.
wine_df %>% 
  # Keep rows with a price, a type and a year strictly after 1900 (rows with
  # a missing year fail the comparison and are dropped too).
  filter(!is.na(price),
         !is.na(type),
         year > 1900) %>% 
  group_by(year, type) %>% 
  # Without `.groups`, the result stays grouped by year and dplyr prints a
  # regrouping message; drop the grouping explicitly instead.
  summarise(mean = mean(price), .groups = "drop") %>% 
  mutate(mean = round(mean, 2)) %>% 
  ggplot(aes(x = year, y = mean, color = type)) +
  geom_point() +
  geom_line() +
  labs(y = "Mean price",
       x = "Year",
       title = "change in mean price by wine type in 1900-2017")

# Point-and-line chart of the mean price per year for years after 2000,
# one series per wine type.
wine_df %>% 
  # Keep rows with a price, a type and a year strictly after 2000 (rows with
  # a missing year fail the comparison and are dropped too).
  filter(!is.na(price),
         !is.na(type),
         year > 2000) %>% 
  group_by(year, type) %>% 
  # Without `.groups`, the result stays grouped by year and dplyr prints a
  # regrouping message; drop the grouping explicitly instead.
  summarise(mean = mean(price), .groups = "drop") %>% 
  mutate(mean = round(mean, 2)) %>% 
  ggplot(aes(x = year, y = mean, color = type)) +
  geom_point() +
  geom_line() +
  labs(y = "Mean price",
       x = "Year",
       title = "change in mean price by wine type in 21 century")

Make plots of ratings by taster.

# Interactive bar chart of the mean rating given by each taster, with the bars
# ordered by that mean.
# `y` holds the y-axis settings passed to layout() at the end of the pipeline.
y <- list(
  title = "Mean ratings"
)
wine_df %>% 
  # Keep rows that have both a rating and a taster name. Reviews without a
  # taster previously formed an NA group, which is presumably the single
  # observation plotly reported as ignored.
  filter(!is.na(points),
         !is.na(taster_name)) %>% 
  group_by(taster_name) %>% 
  # `.groups = "drop"` makes the ungrouped result explicit and silences the
  # summarise() grouping message.
  summarise(mean = mean(points), .groups = "drop") %>% 
  mutate(
    # Order the taster factor by the unrounded mean so bars are sorted.
    taster_name = fct_reorder(taster_name, mean),
    mean = round(mean, 2),
    # Hover text; built after rounding so it shows the rounded mean.
    text_label = str_c("Taster:", taster_name, "\nmean rating:", mean)
  ) %>% 
  plot_ly(
    x = ~taster_name, y = ~mean, color = ~taster_name, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>% 
  layout(yaxis = y)

Who rates the highest-priced wines (priced at $800 or more)?

# Number of reviews per named taster among wines priced at 800 or more.
# The grouped count() is kept so the result stays grouped by taster_name.
wine_df %>%
  drop_na(taster_name) %>%
  filter(price >= 800) %>%
  group_by(taster_name) %>%
  count()
## # A tibble: 5 x 2
## # Groups:   taster_name [5]
##   taster_name          n
##   <chr>            <int>
## 1 Anne Krebiehl MW     1
## 2 Joe Czerwinski       3
## 3 Kerin O’Keefe        1
## 4 Matt Kettmann        1
## 5 Roger Voss          24

Most of the most expensive wines are rated by Roger Voss.

# Table of review counts per named taster, most prolific first.
wine_df %>%
  drop_na(taster_name) %>%
  count(taster_name, sort = TRUE) %>%
  knitr::kable()
taster_name n
Roger Voss 25514
Michael Schachner 15134
Kerin O’Keefe 10776
Virginie Boone 9537
Paul Gregutt 9532
Matt Kettmann 6332
Joe Czerwinski 5147
Sean P. Sullivan 4966
Anna Lee C. Iijima 4415
Jim Gordon 4177
Anne Krebiehl MW 3685
Lauren Buzzeo 1835
Susan Kostrzewa 1085
Mike DeSimone 514
Jeff Jenssen 491
Alexander Peartree 415
Carrie Dykes 139
Fiona Adams 27
Christina Pickard 6

Roger Voss has rated the largest number of wines at Wine Enthusiast Magazine.

Get the wineries with the highest average wine ratings.

# Ten winery/country pairs with the highest mean rating.
wine_df %>% 
  filter(!is.na(winery)) %>% 
  group_by(winery, country) %>% 
  # Drop all grouping here. Previously the summary stayed grouped by winery,
  # so top_n(10) was applied within each winery and returned every row
  # (16,961) instead of the overall top ten.
  summarise(mean = mean(points), .groups = "drop") %>% 
  arrange(desc(mean)) %>% 
  # Take exactly the first ten rows of the sorted table; ties at the cut-off
  # are broken by the existing row order.
  slice_head(n = 10)
## # A tibble: 10 x 3
##    winery                country    mean
##    <chr>                 <chr>     <dbl>
##  1 Araujo                US         98  
##  2 Gandona               US         97  
##  3 J.L. Chave            France     97  
##  4 Ovid                  US         97  
##  5 Standish              Australia  97  
##  6 Salon                 France     96.8
##  7 Tenuta dell'Ornellaia Italy      96.7
##  8 Château Pétrus        France     96.7
##  9 Barons de Rothschild  France     96  
## 10 Bryant Family         US         96  